# ARRIVAL MODEL AND BOOTSTRAPPING CODE #

# libraries
library(dplyr)
library(mgcv)
setwd("Pied Flycatcher Phenology/Interval estimation/Arrival/")

# read in arrival data
piedflycatcher <- read.csv("data/BT_piedflycatcher.csv", sep = ",")[,-1]
# this dataset contains the presences and absences for the different northings and eastings
# as well as landcover variables, duration of the visit, ID of the observer
head(piedflycatcher)

#### GENERAL ARRIVAL MODEL ####

# function for model fitting on arrival data
fullgam <- function (type = "ts", type_te = type, k_joint = 5, degrees = 5,  minday = 60, maxday = 196, yearx = NA,
                     defdata=FALSE,subdataset=NA)
  
{
  start.time <- Sys.time() # Start the clock
  
  # main model
  if(defdata==FALSE) btdata <- piedflycatcher
  # when fitting the model to bootstrapped sets
  if(defdata==TRUE) btdata <- subdataset
  print (yearx)
  
  # filter dataset by year and trim out observations outside the selected period
  dataset <- filter(btdata, year == yearx & dayofyear >= minday & dayofyear <= maxday)
  dataset$ID <- dataset$ID[drop=TRUE] # remove unused observers
  
  # GAM
  mod <- gam(presence ~ te(dayofyear,northing, k = k_joint, bs=type_te)+s(broadleaf1, k = degrees, bs=type)+
               s(altitude1, k = degrees,bs=type)+s(urban1, k = degrees,bs=type)+s(suburban1, k = degrees,bs=type) +
               s(grassland, k = degrees,bs=type) +
               s(coniferous1, k = degrees,bs=type)+s(DURATION_HRS, k = degrees, bs=type),
             family = binomial(link = "logit"), method="REML",
             data = dataset
  )
  
  print (summary(mod))
  print (BIC(mod))
  print (paste0("k=", degrees,",bs=", type))
  
  end.time <- Sys.time()
  print (end.time - start.time)
  
  return(mod)
  
}

#### BOOTSTRAPPING ####

# get the first day that has a presence of over .5
first0.5 <- function(x){
  return (min(which(x >= 0.5)))
}

minday <- 60
maxday <- 196
# CODE TO RUN SET OF BOOSTRAPS #
# running time is lengthy, advised to run in parallel "
for (yr in c(2013:2016))
{
  speciesdata <- filter(piedflycatcher, year == yr & dayofyear >= minday & dayofyear <= maxday)
  dir.create(paste0(yr))
  for (boot in 1:200){
    
    partialstarttime <- Sys.time()
    print(boot)
    
    bootfolder <- paste0(yr,"/bt_",yr,"_boot_",boot,"/")
    dir.create(paste0(outputwd,bootfolder))
    
    set.seed(boot)
    
    bootdata <- sample_frac(speciesdata, size = 1, replace = TRUE)
    mod <- fullgam(k_joint = 5,degrees = 5, yearx = yr, defdata = TRUE, data = bootdata)
    
    # create prediction dataset
    day <- seq(minday, maxday, by=.1)
    north <- seq(floor(min(speciesdata$northing)/10000)*10000,floor(max(speciesdata$northing)/10000)*10000,10000)
    
    # set the prediction values at median
    predframe <- expand.grid(dayofyear=day, northing = north ,
                             broadleaf1 = median(speciesdata$broadleaf1), 
                             urban1 = median(speciesdata$urban1),
                             altitude1=median(speciesdata$altitude1,na.rm = TRUE), 
                             suburban1 = median(speciesdata$suburban1),
                             coniferous1 = median(speciesdata$coniferous1),
                             grassland = median(speciesdata$grassland),
                             DURATION_HRS = 1) 
    
    predL <- predict(mod, newdata=predframe, type="response", se.fit=T)
    predmatrix <- as.matrix(xtabs(predL$fit ~ predframe$dayofyear+predframe$northing))
    
    # standardizing by max at each latitude
    cmatrix <- t(predmatrix) #transposing
    maxvector <- apply(cmatrix,1, FUN = max)
    for (i in 1:nrow(cmatrix)){
      cmatrix[i,] <- cmatrix[i,]/maxvector[i]
    }
    
    # finding out the median arrival date
    colno <- apply(cmatrix, 1, FUN = first0.5)
    dayno <- as.numeric(colnames(cmatrix)[colno])
    
    # in case the bootstrapped data did not get samples from the edges
    maxnas <- which(as.numeric(rownames(cmatrix))> max(bootdata$northing))
    minnas <- which(as.numeric(rownames(cmatrix))< min(bootdata$northing))
    dayno[maxnas] <- NA
    dayno[minnas] <- NA
    colnos <- as.numeric(rownames(cmatrix))
    
    # saving results in adequate folders
    saveRDS(mod,file= paste0(outputwd,bootfolder,"modboot_53_203_",boot,".rds"))
    saveRDS(dayno, paste0(outputwd,bootfolder,"daynos_",boot,".rds")) # dates
    saveRDS(colnos, paste0(outputwd,bootfolder,"colnos_",boot,".rds"))
    
    partialendtime <- Sys.time()
    print (paste0("boot ",boot," took ",round(partialendtime - partialstarttime,4)," seconds to run"))
  }
  
}